# Load every package this analysis needs (pacman installs any that are missing).
pacman::p_load(
  dplyr, tidyr, ggplot2, ggthemes, maps, d3heatmap,
  plotly, googleVis, tidytext, topicmodels
)

# Read the raw Olist Brazilian e-commerce tables from the local dataset folder.
customer <- read.csv('./brazilian-ecommerce/olist_customers_dataset.csv')
location <- read.csv('./brazilian-ecommerce/olist_geolocation_dataset.csv')
# The review file has free-text columns, so keep strings as characters.
review <- read.csv('./brazilian-ecommerce/olist_order_reviews_dataset_translated.csv', stringsAsFactors = FALSE)
payment <- read.csv('./brazilian-ecommerce/olist_order_payments_dataset.csv')
order <- read.csv('./brazilian-ecommerce/olist_orders_dataset.csv')
product <- read.csv('./brazilian-ecommerce/olist_products_dataset.csv')
seller <- read.csv('./brazilian-ecommerce/olist_sellers_dataset.csv')
item <- read.csv('./brazilian-ecommerce/olist_order_items_dataset.csv')
English <- read.csv('./brazilian-ecommerce/product_category_name_translation.csv')

# Attach the English category names to the product table.
# The explicit join key replaces dplyr's implicit 'Joining, by =
# "product_category_name"' guess and makes the join self-documenting.
done <- left_join(product, English, by = "product_category_name")

資料日期範圍:最小(最早)為 2016-10-02,最大(最晚)為 2018-08-31。

# Parse review dates and treat the 1-5 score as a categorical variable.
review$review_creation_date <- as.Date(review$review_creation_date)
review$review_score <- as.factor(review$review_score)

# Monthly distribution of review scores: count reviews per (month, score),
# then convert counts into within-month proportions.
# Idiom fix: use bare column names in filter() instead of `.$col`, which
# bypasses dplyr's data masking.
clean_data <- review %>%
  filter(!is.na(review_score), !is.na(review_creation_date)) %>%
  mutate(review_month = format(review_creation_date, "%m")) %>%
  group_by(review_month, review_score) %>%
  summarise(count = n()) %>%
  group_by(review_month) %>%
  mutate(sum_month = sum(count), avg_count = count / sum_month)

依照月份顯示評論分佈

# Stacked bar chart: share of each review score within every month.
# geom_col() is exactly geom_bar(stat = "identity"); stacking is its default.
ggplot(clean_data, aes(x = review_month, y = avg_count, fill = review_score)) +
  geom_col(position = "stack") +
  labs(fill = '顧客評分', x = "月份", y = "比例") +
  theme(text = element_text(family = "LiGothicMed"))

# Keep only reviews that actually contain a translated message.
clean_df <- review %>% filter(translate_message != "")

# clean_title_df = filter(review,translate_title)

# One row per review: the review id plus its free-text message.
text_df <- tibble(
  doc_id = clean_df$review_id,
  text   = clean_df$translate_message
)
text_df
## # A tibble: 41,744 x 2
##    doc_id                  text                                            
##    <chr>                   <chr>                                           
##  1 e64fb393e7b32834bb789f… Received well before the deadline.              
##  2 f7c4243c7fe1938f181bec… Congratulations lannister stores loved buying o…
##  3 8670d52e15e00043ae7de4… efficient apparatus. on the site the brand of t…
##  4 4b49719c8a200003f700d3… But a little, locking ... for the value ta Boa. 
##  5 3948b09f7c818e2d86c9a5… Reliable seller, ok product and delivery before…
##  6 9314d6f9799f5bfba510cc… I WOULD LIKE TO KNOW WHAT I HAVE, I ALWAYS RECE…
##  7 373cbeecea8286a2b66c97… Terrible                                        
##  8 d21bbc789670eab777d273… Shop note 10                                    
##  9 0e0190b9db53b689b285d3… Thank you for listening.                        
## 10 fe3db7c069d694bab50cc4… The purchase was made easily. The delivery was …
## # … with 41,734 more rows
# Tokenize each review message: one row per word occurrence per document.
# NOTE(review): group_by(word) + mutate(count = n()) attaches the CORPUS-WIDE
# frequency of each word to every row (data stay one-row-per-token and remain
# grouped by word). If a per-document term count was intended for the DTM
# built below, this should be a count over (doc_id, word) — TODO confirm.
token_df = text_df %>%
  unnest_tokens(word, text) %>%
  group_by(word) %>% 
  mutate(count=n())

# Standard English stop-word lexicons shipped with tidytext.
data("stop_words")

# Drop stop words. The explicit key replaces the implicit
# 'Joining, by = "word"' message (word is the only shared column).
a <- token_df %>% anti_join(stop_words, by = "word")

# Cast to a document-term matrix for topic modelling. Note the `count`
# column carried over from token_df is the corpus-wide word frequency —
# verify that this is the intended term weighting.
devotion_dtm <- a %>% cast_dtm(doc_id, word, count)
devotion_dtm
## <<DocumentTermMatrix (documents: 38674, terms: 11511)>>
## Non-/sparse entries: 199143/444977271
## Sparsity           : 100%
## Maximal term length: 240
## Weighting          : term frequency (tf)
# Fit a 4-topic LDA model; fixing the seed makes the fit reproducible.
devotion_lda <- LDA(devotion_dtm, k = 4, control = list(seed = 1234))

# Per-topic word probabilities (the beta matrix) in tidy long format.
devotion_topics_removed <- devotion_lda %>% tidy(matrix = "beta")
devotion_topics_removed
## # A tibble: 46,044 x 3
##    topic term                beta
##    <int> <chr>              <dbl>
##  1     1 received        1.83e- 1
##  2     2 received        8.76e- 6
##  3     3 received        2.72e- 5
##  4     4 received        2.66e-10
##  5     1 deadline        2.19e- 1
##  6     2 deadline        5.51e-13
##  7     3 deadline        1.23e-11
##  8     4 deadline        2.24e-10
##  9     1 congratulations 4.14e- 3
## 10     2 congratulations 1.77e- 3
## # … with 46,034 more rows
# For each topic, plot its 10 highest-probability terms in its own facet.
devotion_topics_removed %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%   # order bars by probability
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()                             # horizontal bars for readable terms

# Reload the ORIGINAL (untranslated) reviews file: from here on `review` no
# longer has the translate_* columns used by the text-mining section above.
review = read.csv('./brazilian-ecommerce/olist_order_reviews_dataset.csv',stringsAsFactors = F)
# Brazil polygon for the map plots below.
Brazil<-map_data("world")%>%filter(region=="Brazil")

# One representative coordinate per city, for sellers and customers.
# NOTE(review): max() of lat and lng are taken independently, so the resulting
# pair may not match any real geolocation row — confirm this approximation.
selllocation<-location %>% group_by(geolocation_city) %>% summarise(selllat = max(geolocation_lat),selllng=max(geolocation_lng))
custlocation<-location %>% group_by(geolocation_city) %>% summarise(custlat = max(geolocation_lat),custlng=max(geolocation_lng))

# Assemble one wide order table step by step:
# items + sellers, orders + customers, then products, payments and the
# per-city seller/customer coordinates computed above.
MergedData      <- merge(item, seller, by = "seller_id")
CustOrd         <- merge(order, customer, by = "customer_id")
custsellord     <- merge(CustOrd, MergedData, by = "order_id")
custsellordprod <- merge(custsellord, product, by = "product_id")
complete        <- merge(custsellordprod, payment, by = "order_id")
complete1       <- merge(complete, selllocation, by.x = "seller_city", by.y = "geolocation_city")
complete2       <- merge(complete1, custlocation, by.x = "customer_city", by.y = "geolocation_city")

# Attach seller info to geolocation rows via the zip-code prefix.
location2 = merge(location, seller, by.x="geolocation_zip_code_prefix", by.y="seller_zip_code_prefix")

# NOTE(review): this OVERWRITES the customer-side `custlocation` built above,
# and despite its name it is now derived from SELLER zip prefixes. complete2
# has already been built, so the earlier maps are unaffected — but verify any
# later use of `custlocation` expects the seller-based version.
custlocation = location2 %>% group_by(geolocation_city) %>% summarise(custlat = max(geolocation_lat),custlng=max(geolocation_lng))

# Bounding box of Brazil (decimal degrees), used to discard geolocation typos.
# BUG FIX: the original reassigned `geo` from `location` on every line, so
# only the LAST (eastern-limit) filter survived. Apply all four limits at once.
#   Northern limit:  5 deg 16' 27.8"  N
#   Western limit:  73 deg 58' 58.19" W
#   Southern limit: 33 deg 45' 04.21" S
#   Eastern limit:  34 deg 47' 35.33" W
geo = location[location$geolocation_lat <= 5.27438888 &
                 location$geolocation_lng >= -73.98283055 &
                 location$geolocation_lat >= -33.75116944 &
                 location$geolocation_lng <= -34.79314722, ]

# Remove coordinate typos: drop rows whose latitude lies north of Brazil's
# northernmost point, for both seller and customer coordinates.
complete2 <- complete2[complete2$selllat <= 5.27438888, ]
complete2 <- complete2[complete2$custlat <= 5.27438888, ]

# Map every seller location on the Brazil polygon, coloured by seller state.
ggplot() +
  geom_polygon(data = Brazil, aes(x = long, y = lat, group = group), fill = "black") +
  geom_point(data = complete2, aes(x = selllng, y = selllat, color = seller_state), size = 0.2)

### Top 20 buyers' locations
# Trace out the unique customer ids.
temp1 = merge(customer, order, by.x = "customer_id") 
# NOTE(review): temp2 keeps one row per payment (mutate, not summarise), so
# head(temp2, 20) below takes the 20 highest-avgvalue ROWS, which can contain
# the same customer several times — confirm whether 20 DISTINCT buyers were
# intended.
temp2 = merge(temp1, payment, by.x = "order_id") %>% 
  group_by(customer_unique_id) %>% 
  mutate(avgvalue = mean(payment_value)) %>% 
  arrange(desc(avgvalue))
temp3 = head(temp2, 20)
# Attach coordinates; PA rows are excluded — presumably known outliers, verify.
temp4 = merge(temp3, complete2, by = "customer_unique_id") %>% 
  group_by(customer_unique_id) %>% 
  filter(customer_state.y != "PA")

# Map the top-20 buyers, coloured by their customer state.
ggplot() +
  geom_polygon(data = Brazil, aes(x = long, y = lat, group = group), fill = "black") +
  geom_point(data = temp4, aes(x = custlng, y = custlat, color = customer_state.y), size = 5) +
  ggtitle("Map of top 20 buyers")

# Per-seller sales summary: volume, revenue, product range and price stats.
C <- merge(done, item, by.x = "product_id") %>%
  group_by(seller_id) %>%
  summarise(
    ItemsSold    = n(),                    # order items sold
    Rev          = sum(price),             # total revenue
    noProd       = n_distinct(product_id), # distinct products offered
    avgPrice     = mean(price),
    maxPrice     = max(price),
    minPrice     = min(price),
    avgFreight   = mean(freight_value),
    avgRevProd   = Rev / noProd,           # revenue per product
    avgItemsProd = ItemsSold / noProd      # items sold per product
  )

# Re-attach item rows to the per-seller summary, then fold in review scores.
D <- merge(C, item, by.x = "seller_id")
E <- merge(D, review, by.x = "order_id") %>%
  group_by(seller_id) %>%
  arrange(review_score) %>%
  summarise(
    ItemsSold       = n(),
    Rev             = sum(price),
    noProd          = n_distinct(product_id),
    avgPrice        = mean(price),
    maxPrice        = max(price),
    minPrice        = min(price),
    avgFreight      = mean(freight_value),
    avgRevProd      = Rev / noProd,
    avgItemsProd    = ItemsSold / noProd,
    avgreview_score = mean(review_score)   # mean of the 1-5 scores
  )

###


# Scatter: does a seller's average freight cost relate to its review score?
ggplot(E, aes(x = avgreview_score, y = avgFreight)) +
  geom_point()

###

A <- merge(done, item, by = "product_id")
# Per-category summary feeding the two interactive charts below.
B <- merge(A, review, by.x = "order_id") %>%
  group_by(product_category_name_english) %>%
  summarise(
    Itemsold        = n(),
    noSeller        = n_distinct(seller_id),
    noProd          = n_distinct(product_id),
    Rev             = sum(price),
    avgPrice        = mean(price),
    avgRevProd      = Rev / noProd,
    avgItemsProd    = Itemsold / noProd,
    avgreview_score = mean(review_score),
    avgSellerRev    = Rev / noSeller,
    dummy           = 2018  # constant "time" axis required by gvisMotionChart
  )
## Warning: Factor `product_category_name_english` contains implicit NA,
## consider using `forcats::fct_explicit_na`
# Interactive scatter: seller count vs average price per category, bubble size
# showing revenue per product and colour showing the mean review score.
# BUG FIX: the original passed `size = 8` INSIDE aes(), which maps the
# constant 8 through the size scale (clobbering the bubble-size mapping for
# the labels) instead of setting a fixed text size. It is now a parameter.
g <- ggplot(B, aes(x = noSeller, y = avgPrice, size = avgRevProd, color = avgreview_score)) +
  geom_point() +
  geom_text(aes(label = product_category_name_english), size = 8)
ggplotly(g)
# Bubble (motion) chart over the constant dummy year.
plot(gvisMotionChart(
  B, "product_category_name_english", "dummy",
  options = list(width = 800, height = 600)))
## starting httpd help server ... done

第二部分

# Drop orders with any missing field, then join customer and payment info.
order = na.omit(order)
a = merge(order,customer)
# NOTE(review): `c` (orders + customers + payments) is never used afterwards —
# dead code? Also, it keeps the raw character dates because the conversions
# below run only on `a`, after `c` was built.
c = merge(a,payment)
# Convert all timestamp columns of `a` to Date.
a$order_delivered_customer_date = as.Date(a$order_delivered_customer_date)
a$order_estimated_delivery_date = as.Date(a$order_estimated_delivery_date)
a$order_purchase_timestamp = as.Date(a$order_purchase_timestamp)
a$order_approved_at = as.Date(a$order_approved_at)
a$order_delivered_carrier_date = as.Date(a$order_delivered_carrier_date)

# Days between actual and estimated delivery (difftime; negative = delivered
# before the estimate).
a$estimated_actual = a$order_delivered_customer_date - a$order_estimated_delivery_date 
a = na.omit(a)

# Mean (delivered - estimated) gap per customer state.
b <- aggregate(a$estimated_actual, by = list(a$customer_state), mean)
ggplot(b, aes(x = Group.1, y = x)) +
  geom_bar(stat = "identity") +
  labs(x = "customer state", y = "avg time")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Cross-tab of purchase counts: customer state x purchase month.
e= a[,c("order_purchase_timestamp","customer_state")]
table(e$customer_state, month=format(e$order_purchase_timestamp,'%m'))
##     month
##        01   02   03   04   05   06   07   08   09   10   11   12
##   AC    8    6    4    9   10    7    9    7    5    5    5    5
##   AL   38   39   37   48   45   34   37   33   19   29   26   12
##   AM   12   16   13   19   17    8   23    9    9    3   10    6
##   AP   11    4    8    5   10    4    7    5    2    3    4    4
##   BA  252  266  322  300  356  301  389  311  167  161  243  188
##   CE   85   96  119  138  134  116  137  128   74   69  103   79
##   DF  148  191  201  180  200  216  240  226   94   98  157  129
##   ES  154  180  180  185  222  202  203  197   93  103  168  108
##   GO  162  170  188  170  219  178  189  206   84  113  153  125
##   MA   61   65   71   71   63   56   76   67   41   52   53   40
##   MG  947 1030 1199 1044 1163 1070 1079 1159  493  577  922  669
##   MS   71   75   76   58   70   76   73   59   32   33   44   34
##   MT   92   84   68   89  103   82   84   77   34   51   73   49
##   PA   79   81  102  102   75   90   93  100   41   56   69   58
##   PB   33   46   55   48   46   50   77   44   25   29   27   37
##   PE  111  141  143  146  165  132  206  166   74   83  125  101
##   PI   54   42   47   45   53   43   52   40   22   24   31   23
##   PR  433  445  490  493  514  473  509  547  176  217  365  261
##   RJ  941 1107 1234 1132 1282 1106 1243 1260  591  697 1012  746
##   RN   47   31   51   42   39   47   56   40   23   27   41   30
##   RO   23   22   29   18   25   21   26   22   16   13   17   11
##   RR    1    5    7    4    3    8    6    0    1    4    2    0
##   RS  419  461  551  477  555  519  550  589  273  261  409  278
##   SC  337  311  351  345  372  314  347  353  153  184  288  192
##   SE   23   27   39   26   19   33   39   42   16   25   26   20
##   SP 3257 3229 3937 3875 4501 4019 4255 4829 1576 1818 2898 2295
##   TO   18   27   27   32   33   26   23   28   16   13   17   14
# Heatmap of the same state-by-month table, with no row/column dendrograms.
# Idiom fix: named arguments and TRUE/FALSE replace the positional,
# reassignable `F` shorthands.
table(e$customer_state, month = format(e$order_purchase_timestamp, '%m')) %>%
  as.data.frame.matrix %>%
  d3heatmap(Rowv = FALSE, Colv = FALSE,
            col = colorRamp(c('seagreen', 'lightyellow', 'red')))
# Average delivery gap for each review score.
d <- merge(a, review)
d <- aggregate(d$estimated_actual, by = list(d$review_score), mean)
ggplot(d, aes(x = Group.1, y = x)) +
  geom_bar(stat = "identity") +
  labs(x = "review score", y = "avg time")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Mean delivery gap per (state, review score) combination.
# FIX: removed the dangling empty argument the original left in summarise().
a1 <- merge(a, review)
a1 <- a1 %>%
  group_by(customer_state, review_score) %>%
  summarise(avgtime = mean(estimated_actual))
ggplot(a1, aes(x = customer_state, y = avgtime, fill = review_score)) +
  geom_bar(stat = "identity") +
  labs(x = "customer state", y = "avg time")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Per product category: sales volume, delivery gap, score and price.
a3 <- merge(a, item)
a3 <- merge(a3, product)
a3 <- merge(a3, English)
a3 <- merge(a3, review)
a4 <- a3[, c("product_category_name_english", "estimated_actual", "review_score", "price")]
a5 <- a4 %>% group_by(product_category_name_english) %>% summarise(
  itemsold = n(),
  avgtime  = mean(estimated_actual),
  avgscore = mean(review_score),
  avgprice = mean(price)
)
# FIX: dropped `repel = T` from aes() — it is not a ggplot2 aesthetic and was
# silently ignored (ggrepel::geom_text_repel would be needed to repel labels).
ggplot(a5, aes(x = log(itemsold), y = avgtime, col = avgscore)) +
  geom_point(aes(size = log(avgprice))) +
  geom_text(aes(label = product_category_name_english), size = 3)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#第三部份

問題:顧客真的會看商品介紹嗎?以 product 資料表的 product_description_lenght 欄位(原始資料集中的欄位名稱,拼字即為如此)進行分析。

summary(payment)  
##                              order_id      payment_sequential
##  fa65dad1b0e818e3ccc5cb0e39231352:    29   Min.   : 1.000    
##  ccf804e764ed5650cd8759557269dc13:    26   1st Qu.: 1.000    
##  285c2e15bebd4ac83635ccc563dc71f4:    22   Median : 1.000    
##  895ab968e7bb0d5659d16cd74cd1650c:    21   Mean   : 1.093    
##  ee9ca989fc93ba09a6eddc250ce01742:    19   3rd Qu.: 1.000    
##  fedcd9f7ccdc8cba3a18defedd1a5547:    19   Max.   :29.000    
##  (Other)                         :103750                     
##       payment_type   payment_installments payment_value     
##  boleto     :19784   Min.   : 0.000       Min.   :    0.00  
##  credit_card:76795   1st Qu.: 1.000       1st Qu.:   56.79  
##  debit_card : 1529   Median : 1.000       Median :  100.00  
##  not_defined:    3   Mean   : 2.853       Mean   :  154.10  
##  voucher    : 5775   3rd Qu.: 4.000       3rd Qu.:  171.84  
##                      Max.   :24.000       Max.   :13664.08  
## 

付款方式:

# Customer region x payment method table: orders (order_id, customer_id)
# joined to customer ids/state and to the payment type.
# NOTE(review): the numeric column selections ([,c(1,2)], [,c(1,2,5)],
# [,c(1,3)]) are positional and silently break if column order changes —
# confirm they pick the intended id/state/payment_type columns.
zz_a <- left_join(order[,c(1,2)], customer[,c(1,2,5)]) %>% left_join(payment[,c(1,3)])
## Joining, by = "customer_id"
## Joining, by = "order_id"
## Warning: Column `order_id` joining factors with different levels, coercing
## to character vector

# One facet per payment method, bars dodged by customer state.
# NOTE(review): `width = 8` makes the dodged bars overlap heavily — was 0.8
# intended? TODO confirm.
ggplot(zz_a, aes(x=payment_type, fill=customer_state)) + 
  xlab('付款方式')+
  ylab('人數')+
  geom_bar(position="dodge", width = 8) +
  facet_wrap(~ payment_type, scales = "free")+
  theme(text = element_text(family="LiGothicMed"))

# Customer counts by state.
ggplot(zz_a, aes(x = customer_state)) +
  xlab('地區') +
  ylab('人數') +
  ggtitle('各地區顧客人數') +
  geom_bar(position = "dodge") +
  theme(text = element_text(family = "LiGothicMed"))

# Does listing detail (description length, photo count) affect sales volume?

# Units sold per product_id.
zz_temp <- table(item$product_id) %>% as.data.frame()
# Attach the sales count (Freq) to the product table.
product <- product %>% merge(zz_temp, by.x = "product_id", by.y = "Var1")
# Sales count + photo count + description length, restricted to products
# selling fewer than 10 units.
# FIX: removed the `na.rm = T` argument — merge() has no such parameter; it
# was silently swallowed by `...` and did nothing.
zz_b <- product[, c(1:5)] %>%
  merge(zz_temp, by.x = "product_id", by.y = "Var1") %>%
  filter(Freq < 10)

# Description length vs photo count, faceted by units sold, with a GAM trend.
ggplot(zz_b, aes(product_description_lenght, product_photos_qty, col = Freq)) +
  xlab('商品介紹字數') +
  ylab('圖片數目') +
  ggtitle('探討介紹詳細程度對銷售量影響') +
  scale_colour_continuous("數量") +
  geom_jitter(size = 1) +
  geom_smooth(method = "gam", se = TRUE, col = "orange", lty = 2) +
  facet_wrap(~ Freq, scales = "free") +
  theme(text = element_text(family = "LiGothicMed"))
## Warning: Removed 590 rows containing non-finite values (stat_smooth).
## Warning: Removed 590 rows containing missing values (geom_point).

可作為研究分期付款優惠的樣本

# Payment value vs number of installments (all payments), with a GAM trend.
# FIX: use bare column names in aes() instead of `payment$...` — indexing the
# data frame inside aes() bypasses ggplot2's data masking (the explicit
# xlab/ylab keep the displayed labels unchanged).
ggplot(payment, aes(payment_installments, payment_value)) +
  xlab('分期數') +
  ylab('付款金額') +
  ggtitle('付款金額與分期數的關係') +
  geom_jitter(size = 0.5) +
  geom_smooth(method = "gam", se = TRUE, col = "orange", lty = 2) +
  theme(text = element_text(family = "LiGothicMed"))

# Same relationship, restricted to payments below 500.
# FIX: bare column names in aes() instead of `zz_c$...` (see the full-table
# plot above for the rationale).
zz_c <- payment %>% filter(payment_value < 500)
ggplot(zz_c, aes(payment_installments, payment_value)) +
  xlab('分期數') +
  ylab('付款金額') +
  ggtitle('付款金額與分期數的關係') +
  geom_jitter(size = 0.5) +
  geom_smooth(method = "gam", se = TRUE, col = "orange", lty = 2) +
  theme(text = element_text(family = "LiGothicMed"))

# Same relationship, restricted to payments below 200.
# FIX: bare column names in aes() instead of `zz_d$...`.
zz_d <- payment %>% filter(payment_value < 200)
ggplot(zz_d, aes(payment_installments, payment_value)) +
  xlab('分期數') +
  ylab('付款金額') +
  ggtitle('付款金額與分期數的關係') +
  geom_jitter(size = 0.5) +
  geom_smooth(method = "gam", se = TRUE, col = "orange", lty = 2) +
  theme(text = element_text(family = "LiGothicMed"))

# Seller revenue, sales volume and ratings.
# `sum` is the per-item gross (price + freight); as a COLUMN name it does not
# interfere with calls to the base sum() function below.
item$sum      = item$price + item$freight_value
item = item %>%left_join(product[,c(1,2)])
## Joining, by = "product_id"
# NOTE(review): the numeric column selections below are positional and
# order-dependent — confirm they pick the intended id/price/category columns.
sData = seller %>% left_join(item[,c(1,3,4,6,7,8)]) %>% left_join(review[,c(2,3)])
## Joining, by = "seller_id"
## Joining, by = "order_id"
## Warning: Column `order_id` joining factor and character vector, coercing
## into character vector
b = group_by(item, seller_id)
sale  = b %>% summarise( # total sales per seller
  total = sum(sum)       # seller 4869f7a5dfa277a7dca6462dcf3b52b2 has the highest total
)
model_seller = item[item$seller_id == "4869f7a5dfa277a7dca6462dcf3b52b2",] 
# Example seller's basic info (revenue, volume, ratings, products).


#tapply(Z$avgScore, Z$seller_state, mean) %>% sort %>% tail(20) 
#建立seller的基本資料表中的資料表 by Tony的程式碼

# 1. Per-seller sales summary (same metrics as table C earlier in the file).
Z <- item %>%
  group_by(seller_id) %>%
  summarise(
    ItemsSold    = n(),
    Rev          = sum(price),              # total revenue
    noProd       = n_distinct(product_id),  # number of distinct products
    avgPrice     = mean(price),             # average item price
    maxPrice     = max(price),              # highest item price
    minPrice     = min(price),              # lowest item price
    avgFreight   = mean(freight_value),     # average freight cost
    avgRevProd   = Rev / noProd,            # revenue per product
    avgItemsProd = ItemsSold / noProd       # items sold per product
  )
# 2. Add per-seller review statistics.
# NOTE(review): item[,c(1,4)] / review[,2:3] are positional selections —
# confirm they are (order_id, seller_id) and (order_id, review_score).
X = unique(item[,c(1,4)]) %>% left_join(review[,2:3]) %>% 
  group_by(seller_id) %>% summarise(
    noReview = n(),                 # number of reviews
    avgScore = mean(review_score),  # mean score
    minScore = min(review_score),   # lowest score
    maxScore = max(review_score))   # highest score
## Joining, by = "order_id"
## Warning: Column `order_id` joining factor and character vector, coercing
## into character vector
Z = Z %>% left_join(X)
## Joining, by = "seller_id"
Z = Z[,-1]                          # drop the first column (the seller id)
is.na(Z) %>% colSums # confirm there are no NAs
##    ItemsSold          Rev       noProd     avgPrice     maxPrice 
##            0            0            0            0            0 
##     minPrice   avgFreight   avgRevProd avgItemsProd     noReview 
##            0            0            0            0            0 
##     avgScore     minScore     maxScore 
##            0            0            0
# Cluster sellers on the summary metrics, excluding the max/min columns.
ZZ <- Z[-c(5, 6, 12, 13)]
ZZ1 <- scale(ZZ) %>% data.frame          # standardise every metric
zz_dist <- dist(ZZ1, method = "euclidean")
zz_hc <- hclust(zz_dist, method = 'ward.D')
# plot(zz_hc)  # dendrogram
zz_kg <- cutree(zz_hc, k = 5)            # assign each seller to 1 of 5 groups

# Mean of each (unscaled) metric within each of the 5 clusters.
sapply(split(ZZ, zz_kg), colMeans) %>% round(2)
##                    1        2      3       4      5
## ItemsSold       9.90   314.20   3.38   35.72   7.28
## Rev          4526.45 35670.17 386.09 3487.03 701.57
## noProd          4.69    68.26   2.14   13.13   3.98
## avgPrice      472.63   440.54 108.86  102.08  94.90
## avgFreight     49.14    29.32  18.10   18.93  17.13
## avgRevProd    845.39  1912.76 167.25  288.37 162.92
## avgItemsProd    1.92     9.55   1.57    2.95   1.76
## noReview        9.19   282.31   2.85   31.51   6.49
## avgScore        4.04     3.99   2.10    3.98   4.72
# Grouped bar chart of the SCALED cluster means, with a Chinese legend
# (one colour per metric).
par(cex = 0.8)
par(family = "LiGothicMed")
split(ZZ1, zz_kg) %>%
  sapply(colMeans) %>%
  barplot(beside = TRUE, col = rainbow(9))
legend(38, 2.3,
       c("商品數量", "總銷售額", "商品種類數目","平均商品價格","平均運費","平均商品評價","平均賣出數量","回復數量","平均評價星等"),
       fill = rainbow(9), cex = 0.9)